# Setup: locate the score HDF5 files and start a parallel backend.
# FIX: removed stray bare symbol `read` that was on the first line — evaluating
# it errors with "object 'read' not found" unless something upstream defines
# it (nothing visible here does).
# NOTE(review): dir_ls() is fs::dir_ls and plan()/multisession come from the
# future package — assumed attached earlier in the file; confirm.
list_wd <- strsplit(getwd(), '/')[[1]]
if (list_wd[length(list_wd)] == 'hadaca3_framework') {
  # Running from the repo root: scores are collected under ./output/scores/.
  score_files <- list.files(path = "./output/scores/", full.names = TRUE)
} else {
  # Otherwise the score files sit directly in the working directory.
  # score_files <- list.files(pattern = 'score-li*', full.names = TRUE)
  # score_files <- system("find . -maxdepth 1 -type f -name 'score-li*'", intern = TRUE)
  score_files <- dir_ls(".", regexp = "score-li.*")
}
# 25 background R sessions for the per-file scoring below.
plan(multisession, workers = 25)
# plan(sequential)
# Parse one score file name into its pipeline components and attach them to
# the scores read from the file. Returns NULL when the file name does not
# match the expected pattern or when the HDF5 read fails; otherwise a
# data.frame of metadata columns cbind-ed to the scores.
process_file <- function(score_file) {
  # Column names, in the same order as the regex capture groups below.
  fields <- c(
    "dataset", "ref",
    "preprocessing_mixRNA", "feature_selection_mixRNA",
    "preprocessing_RNA", "feature_selection_RNA",
    "preprocessing_scRNA", "feature_selection_scRNA", "deconvolution_rna",
    "preprocessing_mixMET", "feature_selection_mixMET",
    "preprocessing_MET", "feature_selection_MET", "deconvolution_met",
    "late_integration"
  )
  # str_match() returns a one-row matrix: full match first, then the 15
  # capture groups — [2:16] keeps only the groups.
  parsed <- str_match(
    basename(score_file),
    "score-li-(.+)_(.+)_mixRNA_(.+)_(.+)_RNA_(.+)_(.+)_scRNA_(.+)_(.+)_(.+)_mixMET_(.+)_(.+)_MET_(.+)_(.+)_(.+)_(.+).h5"
  )[2:16]
  # File name doesn't follow the expected pattern: skip it.
  if (any(is.na(parsed))) {
    return(NULL)
  }
  scores <- tryCatch({
    contents <- read_hdf5(score_file)
    gc()  # free HDF5 buffers promptly; many files are read in parallel
    contents
  }, error = function(e) {
    message("Error reading file: ", score_file)
    message(e)
    NULL
  })
  if (is.null(scores)) {
    return(NULL)
  }
  # One metadata row (all character columns), then the score columns.
  meta <- as.data.frame(as.list(setNames(parsed, fields)),
                        stringsAsFactors = FALSE)
  cbind(meta, scores)
}
# Process files in parallel across the multisession workers; any per-file
# error is converted to NULL so one bad file cannot kill the whole run.
# results_list <- lapply(score_files, process_file)
results_list <- future_map(score_files, function(path) {
  tryCatch(process_file(path), error = function(e) NULL)
})
# Stack the per-file rows; do.call(rbind, ...) silently drops NULL entries.
results_li <- do.call(rbind, results_list)
# Rank late-integration strategies by their median aggregated score across
# all configurations; the result autoprints at top level (reprex output kept
# below).
results_li %>%
  # filter(dc==2) %>%
  group_by(late_integration) %>%
  summarise(GlobalScore = median(score_aggreg)) %>%
  arrange(desc(GlobalScore))
#> # A tibble: 5 × 2
#>   late_integration GlobalScore
#>   <chr>                  <dbl>
#> 1 OnlyMet                0.671
#> 2 limeanRMSE             0.671
#> 3 limean                 0.657
#> 4 liCtSens               0.657
#> 5 OnlyRna                0.542
# Median score per full pipeline configuration, best first.
# FIX: `.groups = "keep"` was previously passed to group_by(), where it is
# not an argument — dplyr treated it as an extra constant grouping column
# literally named `.groups` (hence the "grouped output by ..." message this
# chunk used to emit). `.groups` belongs on summarise(); "drop" returns an
# ungrouped table, which is what arrange() over all rows expects.
results_li_arrange <- results_li %>%
  group_by(preprocessing_mixRNA, feature_selection_mixRNA,
           preprocessing_RNA, feature_selection_RNA,
           preprocessing_scRNA, feature_selection_scRNA, deconvolution_rna,
           preprocessing_mixMET, feature_selection_mixMET,
           preprocessing_MET, feature_selection_MET, deconvolution_met,
           late_integration) %>%
  summarise(GlobalScore = median(score_aggreg), .groups = "drop") %>%
  arrange(desc(GlobalScore))
# Optional: freeze dataset/ref as factors whose level order is their first
# appearance in the table (keeps downstream plots in file order).
all_data_used <- c('dataset', 'ref')
results_li[all_data_used] <- lapply(
  results_li[all_data_used],
  function(column) factor(column, levels = unique(column))
)
# Optional: order the pipeline-component factor levels by performance on the
# 'invitro1' dataset (best score first), when that dataset is present.
# FIX: the previous code indexed the *full* column with an ordering computed
# on the invitro1 *subset* (`x[order(score[dataset == 'invitro1'])]`), a
# length mismatch that drew levels from arbitrary leading rows. Subset first,
# then order. Levels never seen on invitro1 are appended via union() so
# factor() does not coerce those values to NA.
if ("invitro1" %in% results_li$dataset) {
  all_functions_li <- c(
    'preprocessing_mixRNA', 'feature_selection_mixRNA',
    'preprocessing_RNA', 'feature_selection_RNA',
    'preprocessing_scRNA', 'feature_selection_scRNA', 'deconvolution_rna',
    'preprocessing_mixMET', 'feature_selection_mixMET',
    'preprocessing_MET', 'feature_selection_MET', 'deconvolution_met',
    'late_integration'
  )
  in_invitro <- results_li$dataset == 'invitro1'
  # Row order of the invitro1 subset, best aggregated score first.
  score_order <- order(results_li$score_aggreg[in_invitro], decreasing = TRUE)
  for (fun in all_functions_li) {
    values <- as.character(results_li[[fun]])
    ranked_levels <- unique(values[in_invitro][score_order])
    results_li[[fun]] <- factor(values,
                                levels = union(ranked_levels, unique(values)))
  }
}
# Write compressed output: full per-configuration score table as a
# gzip-compressed CSV (write.csv opens and closes the gzfile connection
# itself).
write.csv(results_li, file = gzfile("results_li.csv.gz"), row.names = FALSE)
# Column position of the aggregated score — presumably consumed by a later
# display chunk (e.g. a DT datatable); verify against the rest of the file.
index_aggreg <- which(names(results_li) == "score_aggreg")
# NOTE(review): the warning below is emitted by DT rendering in a later
# chunk, not by these lines.
#> Warning in instance$preRenderHook(instance): It seems your data is too big for
#> client-side DataTables. You may consider server-side processing:
#> https://rstudio.github.io/DT/server.html